# Load packages
library(here)
library(dplyr)
library(readr)
library(rstan)
library(bayesrules)
library(tidyverse)
library(bayesplot)
library(rstanarm)
library(janitor)
library(tidybayes)
library(broom.mixed)
library(here)
library(sf)
library(tidycensus)
library(openxlsx)
library(s2)
library(nycgeo)
library(CARBayes)
library(spData)
library(spdep)
nyc_join <- merge(nta_sf,nta_acs_data)
nyc_join <- nyc_join %>%
st_transform(., 4269)
county_list <- nyc_join %>% pull(county_name) %>% unique()
census_api_key("0cc07f06386e317f312adef5e0892b0d002b7254")
census_data <- get_acs(state = "NY",
county = c(county_list),
geography = "tract",
variables = c(gini_inequality ="B19083_001"),
year = 2019,
output = "wide",
survey = "acs5",
geometry = TRUE) %>%
dplyr::select(-c(NAME, ends_with("M"))) %>%
rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2))) %>%
st_transform(., 4269) %>%
dplyr::select(-GEOID)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|================ | 22%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|==================== | 28%
|
|===================== | 30%
|
|====================== | 31%
|
|======================== | 34%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 42%
|
|===================================== | 52%
|
|========================================= | 58%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================== | 71%
|
|===================================================== | 76%
|
|======================================================== | 80%
|
|========================================================== | 82%
|
|======================================================================| 100%
# themes
theme_set(theme_minimal())
vari_names <- read_csv(here("clean_data", "nyc_names.csv"))
nyc_clean <- st_read(here("clean_data", "nyc_data.shp"), crs = 4269, quiet=T)
colnames(nyc_clean) <- colnames(vari_names)
library(openxlsx)
nta_to_census <- openxlsx::read.xlsx(here("ethnic", "Data", "census_to_nta.xlsx")) %>%
dplyr::select(BoroName, NTACode) %>%
rename(borough = BoroName,
nta_id = NTACode) %>%
unique()
nyc_clean <- nyc_clean %>%
merge(., nta_to_census, by="nta_id") %>%
mutate(transportation_desert_4cat = case_when(
transportation_desert_4cat==1 ~ "Poor",
transportation_desert_4cat ==2 ~ "Limited",
transportation_desert_4cat ==3 ~ "Satisfactory",
TRUE ~ "Excellent",
)) %>%
mutate(transportation_desert_4cat = factor(transportation_desert_4cat, levels=c("Poor", "Limited", "Satisfactory", "Excellent")))
assault <- st_read("/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp") %>%
filter(str_detect(ofns_desc, "FELONY ASSAULT")) %>%
filter(!(str_detect(ofns_desc, "POLICE"))) %>%
filter(str_detect(pd_desc, "2") | str_detect(pd_desc, "1") ) %>%
dplyr::select(arrest_key,geometry) %>%
st_transform(., 4269)
## Reading layer `geo_export_a659754f-6263-4ba1-8a58-72dca5befa79' from data source
## `/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 115299 features and 19 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -74.25184 ymin: 40.4994 xmax: -73.703 ymax: 40.91272
## Geodetic CRS: WGS84(DD)
sex <- st_read("/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp") %>%
filter(str_detect(ofns_desc, "SEX")) %>%
dplyr::select(arrest_key,geometry) %>%
st_transform(., 4269)
## Reading layer `geo_export_a659754f-6263-4ba1-8a58-72dca5befa79' from data source
## `/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 115299 features and 19 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -74.25184 ymin: 40.4994 xmax: -73.703 ymax: 40.91272
## Geodetic CRS: WGS84(DD)
assault_neighborhood <- st_join(nyc_clean, assault, left = TRUE) %>%
group_by(nta_id) %>%
summarize(assault_count=n()) %>%
as.tibble()
sex_neighborhood <- st_join(nyc_clean, sex, left = TRUE) %>%
group_by(nta_id) %>%
summarize(sexcrime_count=n()) %>%
as.tibble()
gini_neighborhood <- st_join(nyc_clean, census_data,left = TRUE) %>%
group_by(nta_id) %>%
summarize(gini_neighborhood=mean(gini_inequality, na.rm=T)) %>%
as.tibble() %>%
dplyr::select(nta_id, gini_neighborhood)
assault_gini <- left_join(assault_neighborhood, gini_neighborhood,by="nta_id") %>%
mutate(gini = gini_neighborhood, .before=3)%>%
dplyr::select(-gini_neighborhood) %>%
dplyr::select(-geometry)
sex_assault_gini <- left_join(assault_gini, sex_neighborhood,by="nta_id") %>%
mutate(sex_crime_count = sexcrime_count, .before=3)%>%
dplyr::select(-sexcrime_count) %>%
dplyr::select(-geometry)
nyc_clean <- nyc_clean %>%
as.tibble() %>%
filter(nta_id %in% sex_assault_gini$nta_id) %>%
left_join(., sex_assault_gini, by="nta_id")%>%
unique() %>%
st_as_sf()
subway_stations <- st_read(here("ethnic","Data","stations", "geo_export_85568705-efba-4456-bdc0-3d70ff2cf8e5.shp"), quiet=T) %>%
st_transform(., 4269)
bus_stations <- st_read(here("ethnic","Data","bus", "bus_stops_nyc_may2020.shp"), quiet=T) %>%
st_transform(., 4269)%>%
filter(str_detect(NAMELSAD, "Richmond", negate=T))
transit_points <- read_csv(here("transit","ridership_points.csv"))%>%
separate(Position, into=c("Point", "longitude", "latitude"), " ") %>%
mutate(latitude = str_remove_all(latitude, "[)]"),
longitude = str_remove_all(longitude, "[()]"),
) %>%
dplyr::select(-c(Point)) %>%
mutate(latitude = as.numeric(latitude),
longitude = as.numeric(longitude)) %>%
st_as_sf(coords = c("longitude", "latitude"), crs = 4269)
#plot locations over map
subway_loc <- ggplot() +
geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
geom_sf(data = subway_stations, color="#3F123C", size=1) +
coord_sf(datum = st_crs(subway_stations)) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Subway Stop Locations \nin NYC")+
theme(#panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
bus_loc <- ggplot() +
geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
geom_sf(data = bus_stations, color="#3F123C", size=.5, alpha=.5) +
coord_sf(datum = st_crs(subway_stations)) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Bus Stop Locations \nin NYC")+
theme(#panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
stops <- nyc_clean %>%
ggplot() +
geom_sf(aes(fill = sub_count), color = "#8f98aa") +
scale_fill_gradient(low= "lavender", high = "maroon",
guide = guide_legend(title = "Number of Subway Stops") ,na.value="#D6D6D6") +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Subway Stop Counts \nin NYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
bus_stops <- nyc_clean %>%
ggplot() +
geom_sf(aes(fill = bus_count), color = "#8f98aa") +
scale_fill_gradient(low= "lavender", high = "maroon",
guide = guide_legend(title = "Number of Bus Stops") ,na.value="#D6D6D6") +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Bus Stop Counts \nin NYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
ridership <- nyc_clean%>%
ggplot() +
geom_sf(aes(fill = log2(mean_ridership)), color = "#8f98aa") +
scale_fill_gradient(low= "lavender", high = "maroon",
guide = guide_legend(title = "Log2 Mean Ridership") ,na.value="#D6D6D6") +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Mean (Log2) Subway Turnstile \nRidership in 2018 \nfor NYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
access <- nyc_clean %>%
ggplot() +
geom_sf(aes(fill = transportation_desert_4cat), color = "#8f98aa") +
scale_fill_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility Category"), na.value="#D6D6D6") +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Subway Deserts \nin NYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
`
red <- ggplot(nyc_clean) +
geom_sf(aes(fill = below_poverty_line_count), color = "#8f98aa") +
scale_fill_gradient(low = "#FCF5EE", high = "#E13728", guide = guide_legend(title = "Number Below \nPoverty Line")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Impoverishement")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
yellow <- ggplot(nyc_clean) +
geom_sf(aes(fill = mean_income), color = "#8f98aa") +
scale_fill_gradient(low = "#FCF5EE", high = "#F3D24E", guide = guide_legend(title = "Mean Income")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Mean Income")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
teal <- ggplot(nyc_clean) +
geom_sf(aes(fill = mean_rent), color = "#8f98aa") +
scale_fill_gradient(low = "#FCF5EE", high = "#2DBDC7", guide = guide_legend(title = "Dollars")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Mean Rent")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
purple <- ggplot(nyc_clean) +
geom_sf(aes(fill = eviction_count), color = "#8f98aa")+
scale_fill_gradient(low = "#FCF5EE", high = "#7826C0", guide = guide_legend(title = "Number of Evictions")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Evictions")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
orange <- ggplot(nyc_clean) +
geom_sf(aes(fill = unemployment_count), color = "#8f98aa")+
scale_fill_gradient(low = "#FCF5EE", high = "#FC9228", guide = guide_legend(title = "Number on \nUnemployment")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Unemployment")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
green <- ggplot(nyc_clean) +
geom_sf(aes(fill = store_count), color = "#8f98aa")+
scale_fill_gradient(low = "#FCF5EE", high = "#326902", guide = guide_legend(title = "Number of Stores")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Retail Food Stores")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
blue <- ggplot(nyc_clean) +
geom_sf(aes(fill = school_count), color = "#8f98aa")+
scale_fill_gradient(low = "#FCF5EE", high = "#5372C4",
guide = guide_legend(title = "Number of Schools")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Number of Schools")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
pink <- ggplot(nyc_clean) +
geom_sf(aes(fill = uninsured_count), color = "#8f98aa")+
scale_fill_gradient(low = "#FCF5EE", high = "#F450E1", guide = guide_legend(title = "Number of People \n without Insurance Coverage")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Insurance Coverage")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
navy <- nyc_clean %>%
ggplot() +
geom_sf(aes(fill = gini), color = "#8f98aa")+
scale_fill_gradient(low = "#F8E3DD", high = "#16236f",
guide = guide_legend(title = "Gini Inequality Values"))+
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Income Inequality")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
white <- ggplot(nyc_clean) +
geom_sf(aes(fill = white_count), color = "#8f98aa") +
scale_fill_gradientn(colors = c("#FCF5EE","#BD9DA5", "#9C7080", "#7B435B"), guide = guide_legend(title = "Number White")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("White Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
black <- ggplot(nyc_clean) +
geom_sf(aes(fill = black_count), color = "#8f98aa") +
scale_fill_gradientn(colors = c("#FCF5EE","#F8ABA6", "#F58581", "#F25F5C"), guide = guide_legend(title = "Number Black")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Black Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
asian <- ggplot(nyc_clean) +
geom_sf(aes(fill = asian_count), color = "#8f98aa") +
scale_fill_gradientn(colors = c("#FCF5EE","#B8BAD9", "#959CCE", "#717EC3"), guide = guide_legend(title = "Number Asian")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Asian Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
latinx <- ggplot(nyc_clean) +
geom_sf(aes(fill = latinx_count), color = "#8f98aa")+
scale_fill_gradientn(colors = c("#FCF5EE","#FDC894", "#FDB166", "#FC9A38"),
guide = guide_legend(title = "Number Latinx")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Latinx Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
pop <- ggplot(nyc_clean) +
geom_sf(aes(fill = total_pop), color = "#8f98aa")+
scale_fill_gradientn(colors = c("#FCF5EE","#C0E3C3", "#A1D9AD", "#81CF97"), guide = guide_legend(title = "Number of People")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Total Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
All the data used in this project are from two major sources: the Tidycensus package and NYC Open Data.
Tidycensus is an R package interface, developed by Kyle Walker and Matt Herman, that enables easy access to the US Census Bureau’s data APIs and returns Tidyverse-ready data frames from various major US Census Bureau datasets. Our demographic and socioeconomic data are drawn from the American Community Survey results found in Tidycensus package. A summary of our ACS data variables is below:
borough:Each Neighborhood’s Borough.total_pop: Total Population by Neighborhoodmean_income: Mean Income by Neighborhoodbelow_poverty_line_count: Number of People Living Below the 100% Poverty Line by Neighborhoodmean_rent: Mean Rent by Neighborhoodunemployment_count: Number of People on Unemployment by Neighborhoodlatinx_count: Number of Latinx People by Neighborhoodwhite_count: Number of White People by Neighborhoodblack_count: Number of Black People by Neighborhoodnative_count: Number of Native People by Neighborhoodasian_count: Number of Asian People by Neighborhoodnaturalized_citizen_count: Number of Naturalized Citizens by Neighborhoodnoncitizen_count: Number of Foreign Born People by Neighborhooduninsured_count: Number of Uninsured Citizens of any Age by NeighborhoodFor remaining predictors, we used NYC Open Data’s portal to identify specific predictors. In particular, we used geotagged locations of Subway Stops, Bus Stops, Grocery Stores, Schools, and Eviction Sites from the Departments of Transportation, Health, Education, and Housing to calculate neighborhood-specific variables described below:
school_count: Number of Public Schools by Neighborhoodeviction_count: Number of Evictions by Neighborhoodstore_count: Number of Grocery Stores and Food Vendors by Neighborhoodsub_count: Number of Subway Stations by Neighborhoodbus_count: Number of Bus Stations by Neighborhoodperc_covered_by_transit: Percent of Neighborhood Within Walking Distance (.25 miles) of Any Subway Stop.transportation_desert_4cat: Subway Accessibility by Neighborhood (None, Limited, Satisfactory, Excellent)Lastly, we acquired subway ridership from Metropolitan Transportation Authority’s turnstile data for the week of September 7, 2019. For each station, entry/exit of each turnstile is recorded. Then, we aggregated this information by taking the station-specific average of subway ridership across the 7 days in the week. Finally, we geotagged each listed station, then took the mean of ridership at all subway stations in each neighborhood to create.
mean_ridership: Mean Subway Ridership by Neighborhood for the week of September 7th.
Our data has 224 observations of 26 variables. Below is a preview of our dataset with colnames attached.
library(kableExtra)
kable(head(nyc_clean, n=3)) %>% kable_styling() %>% scroll_box(width = "100%", height = "200px")
| nta_id | total_pop | mean_income | below_poverty_line_count | below_poverty_line_and_50_count | mean_rent | unemployment_count | latinx_count | white_count | black_count | native_count | asian_count | naturalized_citizen_count | noncitizen_count | uninsured_count | school_count | eviction_count | store_count | sub_count | bus_count | mean_ridership | perc_covered_by_transit | transportation_desert_4cat | borough | geometry | assault_count | sex_crime_count | gini |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BK0101 | 26308 | 98338.67 | 2397 | 1289 | 2062.667 | 582 | 3284 | 20526 | 482 | 40 | 1052 | 3777 | 3129 | 1797 | 6 | 68 | 71 | 2 | 53 | 9410.500 | 78.76390 | Satisfactory | Brooklyn | MULTIPOLYGON (((-73.94074 4… | 22 | 7 | 0.4420550 |
| BK0102 | 57774 | 101238.92 | 9120 | 4474 | 2330.077 | 1710 | 18227 | 32237 | 1460 | 0 | 4008 | 6802 | 7746 | 3725 | 12 | 204 | 129 | 2 | 97 | 26603.000 | 89.80852 | Satisfactory | Brooklyn | MULTIPOLYGON (((-73.96355 4… | 14 | 14 | 0.4859323 |
| BK0103 | 36891 | 30309.25 | 18285 | 5970 | 1194.875 | 457 | 3351 | 31799 | 1288 | 20 | 194 | 3548 | 1012 | 711 | 6 | 45 | 58 | 3 | 35 | 6348.667 | 88.13439 | Satisfactory | Brooklyn | MULTIPOLYGON (((-73.96762 4… | 7 | 1 | 0.4937143 |
Below is a numeric summary of each variable’s distribution.
#summary(nyc_clean)
library(table1)
table_print <- table1(~ total_pop + mean_income + below_poverty_line_count+
mean_rent + unemployment_count + white_count + uninsured_count + school_count + eviction_count + store_count + transportation_desert_4cat+ sub_count + bus_count + mean_ridership | borough, data = nyc_clean %>% as_tibble()) %>% as_tibble()
colnames(table_print) <- c("Variable", "Bronx (N=44)", "Brooklyn (N=64)","Manhattan (N=39)", "Queens (N=77)", "Overall (N=224)")
table_print%>%
filter(Variable!="") %>% kable() %>% kable_styling() %>% scroll_box(width = "100%", height = "500px")
| Variable | Bronx (N=44) | Brooklyn (N=64) | Manhattan (N=39) | Queens (N=77) | Overall (N=224) |
|---|---|---|---|---|---|
| total_pop | |||||
| Mean (SD) | 30200 (18700) | 37800 (24600) | 38300 (24600) | 25900 (20900) | 32300 (22800) |
| Median [Min, Max] | 29800 [0, 69200] | 38300 [0, 97800] | 35700 [0, 95300] | 25000 [0, 87700] | 31600 [0, 97800] |
| mean_income | |||||
| Mean (SD) | 43400 (17900) | 69600 (28700) | 103000 (49700) | 74500 (14400) | 71900 (33900) |
| Median [Min, Max] | 38000 [23100, 94200] | 61200 [27400, 148000] | 108000 [33300, 212000] | 72600 [37500, 104000] | 67200 [23100, 212000] |
| Missing | 8 (18.2%) | 9 (14.1%) | 6 (15.4%) | 19 (24.7%) | 42 (18.8%) |
| below_poverty_line_count | |||||
| Mean (SD) | 8360 (6490) | 7430 (6120) | 6180 (6100) | 3090 (2900) | 5900 (5700) |
| Median [Min, Max] | 7260 [0, 21600] | 6880 [0, 28800] | 3290 [0, 22800] | 2680 [0, 11600] | 4220 [0, 28800] |
| mean_rent | |||||
| Mean (SD) | 1230 (197) | 1580 (465) | 1960 (674) | 1650 (198) | 1600 (465) |
| Median [Min, Max] | 1240 [833, 1620] | 1450 [792, 3280] | 2070 [884, 3270] | 1630 [1140, 2250] | 1510 [792, 3280] |
| Missing | 8 (18.2%) | 9 (14.1%) | 6 (15.4%) | 19 (24.7%) | 42 (18.8%) |
| unemployment_count | |||||
| Mean (SD) | 1400 (972) | 1180 (903) | 1210 (1100) | 756 (685) | 1080 (916) |
| Median [Min, Max] | 1330 [0, 3150] | 1120 [0, 3770] | 952 [0, 4700] | 668 [0, 3150] | 919 [0, 4700] |
| white_count | |||||
| Mean (SD) | 2830 (4990) | 13900 (14200) | 17200 (14700) | 6740 (8500) | 9850 (12300) |
| Median [Min, Max] | 919 [0, 27500] | 10900 [0, 64500] | 13800 [0, 69300] | 4200 [0, 43900] | 4960 [0, 69300] |
| uninsured_count | |||||
| Mean (SD) | 2570 (1990) | 2750 (2260) | 2030 (2150) | 2350 (2510) | 2450 (2280) |
| Median [Min, Max] | 2340 [0, 8030] | 2610 [0, 10100] | 1380 [0, 10300] | 1660 [0, 12200] | 1910 [0, 12200] |
| school_count | |||||
| Mean (SD) | 8.64 (7.44) | 8.16 (6.79) | 8.54 (7.79) | 4.08 (2.88) | 6.92 (6.41) |
| Median [Min, Max] | 5.50 [1.00, 27.0] | 6.00 [1.00, 31.0] | 5.00 [1.00, 28.0] | 3.00 [1.00, 12.0] | 5.00 [1.00, 31.0] |
| eviction_count | |||||
| Mean (SD) | 438 (340) | 245 (234) | 223 (233) | 126 (131) | 238 (255) |
| Median [Min, Max] | 406 [1.00, 1130] | 163 [1.00, 829] | 152 [1.00, 1120] | 93.0 [1.00, 521] | 148 [1.00, 1130] |
| store_count | |||||
| Mean (SD) | 53.2 (38.6) | 69.8 (49.5) | 58.6 (39.8) | 33.1 (33.8) | 52.0 (43.1) |
| Median [Min, Max] | 48.5 [1.00, 138] | 70.5 [1.00, 202] | 54.0 [1.00, 151] | 24.0 [1.00, 147] | 45.0 [1.00, 202] |
| transportation_desert_4cat | |||||
| Poor | 4 (9.1%) | 8 (12.5%) | 2 (5.1%) | 40 (51.9%) | 54 (24.1%) |
| Limited | 16 (36.4%) | 16 (25.0%) | 2 (5.1%) | 22 (28.6%) | 56 (25.0%) |
| Satisfactory | 11 (25.0%) | 18 (28.1%) | 11 (28.2%) | 12 (15.6%) | 52 (23.2%) |
| Excellent | 13 (29.5%) | 22 (34.4%) | 24 (61.5%) | 3 (3.9%) | 62 (27.7%) |
| sub_count | |||||
| Mean (SD) | 1.95 (1.33) | 2.77 (2.06) | 4.03 (3.53) | 1.55 (1.22) | 2.41 (2.23) |
| Median [Min, Max] | 1.00 [1.00, 7.00] | 2.00 [1.00, 9.00] | 3.00 [1.00, 17.0] | 1.00 [1.00, 6.00] | 1.00 [1.00, 17.0] |
| bus_count | |||||
| Mean (SD) | 40.1 (22.9) | 60.2 (41.1) | 46.6 (26.0) | 56.9 (45.0) | 52.7 (38.0) |
| Median [Min, Max] | 43.5 [1.00, 125] | 59.0 [1.00, 170] | 44.0 [2.00, 106] | 52.0 [1.00, 243] | 49.0 [1.00, 243] |
| mean_ridership | |||||
| Mean (SD) | 7180 (3410) | 7400 (4690) | 22900 (19500) | 10900 (12200) | 12000 (13300) |
| Median [Min, Max] | 6670 [2420, 15700] | 6610 [1040, 26600] | 18000 [5640, 110000] | 7920 [273, 55700] | 7960 [273, 110000] |
| Missing | 21 (47.7%) | 18 (28.1%) | 7 (17.9%) | 54 (70.1%) | 100 (44.6%) |
library(ggridges)
plot_1<-nyc_clean %>%
ggplot(aes(x=mean_income, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Mean Income", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_2<-nyc_clean %>%
ggplot(aes(x=below_poverty_line_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Number Below Poverty Line", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_3<-nyc_clean %>%
ggplot(aes(x=mean_rent, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Mean Rent", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_4<-nyc_clean %>%
ggplot(aes(x=unemployment_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Unemployed Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_5<-nyc_clean %>%
ggplot(aes(x=white_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="White Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_6<-nyc_clean %>%
ggplot(aes(x=uninsured_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Uninsured Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_7<-nyc_clean %>%
ggplot(aes(x=school_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="School Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_8<-nyc_clean %>%
ggplot(aes(x=eviction_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Eviction Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_9<-nyc_clean %>%
ggplot(aes(x=store_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Food Retail Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_10<-nyc_clean %>%
ggplot(aes(x=borough, fill=transportation_desert_4cat), alpha=.6) +
geom_bar(position="fill") +
scale_y_continuous(labels = seq(0, 100, by = 25)) +
labs(title="Subway Accessibility", y="", x="")+
theme(panel.grid.major = element_line("transparent"),
# axis.text.y.left = element_blank(),
axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold")) +
scale_fill_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")
plot_11<-nyc_clean %>%
ggplot(aes(x=sub_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Subway Stop Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_12<-nyc_clean %>%
ggplot(aes(x=bus_count, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Bus Stop Counts", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
plot_13<-nyc_clean %>%
ggplot(aes(x=mean_ridership, y=borough, fill=borough), alpha=.6) +
geom_density_ridges() +
labs(title="Mean Ridership", y="")+
theme(panel.grid.major = element_line("transparent"),
axis.text.y.left = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 28,hjust=.5, face = "bold"),
legend.position="none") +
scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
library(egg)
ggarrange(plot_1, plot_2, plot_3,
plot_4, plot_5, plot_6,
plot_7, plot_8, plot_9, plot_11, plot_12,
plot_13, plot_10,
ncol=4)
library(egg)
ggarrange(subway_loc, bus_loc, stops, bus_stops, ridership, access, ncol=3)
ggarrange(red, orange, yellow, green, teal, blue, navy, purple, pink, ncol=3)
ggarrange(pop, white, black, latinx, asian, ncol=3)
nyc_clean %>%
ggplot() +
geom_sf(aes(fill = sex_crime_count), color = "#8f98aa")+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide = guide_legend(title = "Sex-Based Crime Counts")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Sex-Based Violence")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
nyc_clean %>%
ggplot() +
geom_sf(aes(fill = assault_count), color = "#8f98aa")+
scale_fill_gradient(low = "#F8E3DD", high = "#95174F", guide = guide_legend(title = "Felony Assault Counts")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("1st and 2nd Degree Felony \nAssault")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
We use poisson and negative binomial regression to model observed white counts, \(i\), using our \(k=10\) predictors:
- mean_income
- mean_rent
- unemployment_count
- sub_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]
poisson_non_hierarchical <- stan_glm(
white_count ~ mean_income + mean_rent +
unemployment_count +
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) +
xlab("White Resident Count") +
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_non_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 3419.8903715 | 0.0038365 | 3404.9150441 | 3438.9527477 |
| mean_income | 0.0010119 | 0.0000000 | 0.0010055 | 0.0010168 |
| mean_rent | -0.0304929 | 0.0000036 | -0.0309057 | -0.0300877 |
| unemployment_count | 0.0379527 | 0.0000018 | 0.0377482 | 0.0381685 |
| transportation_desert_4catLimited | 71.5709164 | 0.0027320 | 71.0167804 | 72.1022773 |
| transportation_desert_4catSatisfactory | 79.4589797 | 0.0028885 | 78.8097506 | 80.0479727 |
| transportation_desert_4catExcellent | 93.6889140 | 0.0028636 | 92.9789485 | 94.3611356 |
| school_count | 0.1367263 | 0.0001385 | 0.1189483 | 0.1548387 |
| store_count | 1.0357874 | 0.0000282 | 1.0321909 | 1.0392870 |
| bus_count | 0.4829094 | 0.0000224 | 0.4800717 | 0.4854354 |
| eviction_count | -0.3405727 | 0.0000054 | -0.3412664 | -0.3398966 |
| uninsured_count | -0.0082767 | 0.0000005 | -0.0083515 | -0.0082204 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]
negbin_non_hierarchical <- stan_glm(
white_count ~ mean_income + mean_rent +
unemployment_count +
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) +
xlab("White Resident Count") +
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_non_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 1903.3130070 | 0.4882711 | 1053.9749617 | 3690.9242147 |
| mean_income | 0.0014141 | 0.0000069 | 0.0007642 | 0.0022762 |
| unemployment_count | 0.0439401 | 0.0001686 | 0.0238975 | 0.0644908 |
| transportation_desert_4catLimited | 90.2330125 | 0.2297732 | 36.6221884 | 160.7311834 |
| transportation_desert_4catSatisfactory | 91.4182213 | 0.3000295 | 31.9686484 | 178.8491978 |
| transportation_desert_4catExcellent | 123.4284965 | 0.2783874 | 56.9296869 | 211.6754022 |
| store_count | 0.8923541 | 0.0032509 | 0.4765713 | 1.3486077 |
| bus_count | 0.7071825 | 0.0033973 | 0.1927714 | 1.1140631 |
| eviction_count | -0.3279124 | 0.0004594 | -0.3830488 | -0.2664993 |
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Negative Binomial")
Negative binomial has much better error metrics.
We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with
transportation_desert_4cat:
- mean_income
- mean_rent
- unemployment_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]
poisson_hierarchical <- stan_glmer(
white_count ~ mean_income + mean_rent +
unemployment_count +
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count + (1 | borough),
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) +
xlab("White Resident Count") +
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 3125.2861065 | 0.5888946 | 0.7030650 | 4682.1874542 |
| mean_income | 0.0007656 | 0.0000001 | 0.0007519 | 0.0007729 |
| mean_rent | -0.0294444 | 0.0000041 | -0.0299564 | -0.0287658 |
| unemployment_count | 0.0354221 | 0.0000018 | 0.0351726 | 0.0356746 |
| transportation_desert_4catLimited | 71.3728705 | 0.0037461 | 69.2576328 | 72.2937517 |
| transportation_desert_4catSatisfactory | 60.1612356 | 0.0037984 | 57.7672623 | 61.1822450 |
| transportation_desert_4catExcellent | 66.0700782 | 0.0041175 | 63.6562602 | 67.0730274 |
| school_count | -0.2363096 | 0.0001729 | -0.2653370 | -0.2176155 |
| store_count | 0.8335427 | 0.0000397 | 0.8278885 | 0.8445275 |
| bus_count | 0.5955520 | 0.0000333 | 0.5910621 | 0.6418625 |
| eviction_count | -0.3312319 | 0.0000077 | -0.3337286 | -0.3304165 |
| uninsured_count | -0.0063382 | 0.0000007 | -0.0065424 | -0.0062500 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]
negbin_hierarchical <- stan_glmer(
white_count ~ mean_income + mean_rent +
unemployment_count +
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count + (1 | borough),
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) +
xlab("White Resident Count") +
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 1845.8821423 | 0.5406396 | 878.8645051 | 3555.6655106 |
| mean_income | 0.0014266 | 0.0000057 | 0.0006792 | 0.0021613 |
| unemployment_count | 0.0432225 | 0.0001767 | 0.0185096 | 0.0689402 |
| transportation_desert_4catLimited | 108.0384501 | 0.2230049 | 49.2121177 | 175.9991654 |
| transportation_desert_4catSatisfactory | 84.1263761 | 0.2773221 | 35.0198573 | 169.6584737 |
| transportation_desert_4catExcellent | 105.7654825 | 0.3111778 | 37.2125286 | 190.9217058 |
| store_count | 0.7741202 | 0.0026634 | 0.3986937 | 1.1628353 |
| bus_count | 0.5612179 | 0.0035110 | 0.1446237 | 1.0169595 |
| eviction_count | -0.2979056 | 0.0004969 | -0.3636314 | -0.2279982 |
draft <- nyc_clean %>% drop_na(mean_rent) %>% drop_na(mean_income) %>% st_as_sf()
col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$white_count, listw = col_listw, nsim = 999, alternative = "greater")
##
## Monte-Carlo simulation of Moran I
##
## data: col_sp$white_count
## weights: col_listw
## number of simulations + 1: 1000
##
## statistic = 0.45512, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.assault <- S.CARleroux(
white_count ~ 1 + gini +
mean_income + mean_rent +
unemployment_count +
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = draft,
family = "poisson",
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.assault$summary.results) %>%
rownames_to_column("term") %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | 7.1376 | 7.0747 | 7.1878 | 1000 | 43.5 | 3.8 | -0.6 |
| gini | 4.3551 | 4.2764 | 4.4929 | 1000 | 43.5 | 5.1 | -0.7 |
| mean_income | 0.0000 | 0.0000 | 0.0000 | 1000 | 43.5 | 3.9 | -3.2 |
| mean_rent | -0.0006 | -0.0006 | -0.0005 | 1000 | 43.5 | 2.3 | 3.2 |
| unemployment_count | 0.0003 | 0.0002 | 0.0003 | 1000 | 43.5 | 2.4 | -0.3 |
| transportation_desert_4catLimited | 0.7162 | 0.7022 | 0.7219 | 1000 | 43.5 | 3.6 | 3.2 |
| transportation_desert_4catSatisfactory | 0.1018 | 0.0914 | 0.1057 | 1000 | 43.5 | 4.6 | -7.1 |
| transportation_desert_4catExcellent | 0.3500 | 0.3455 | 0.3591 | 1000 | 43.5 | 2.6 | 4.8 |
| school_count | 0.0088 | 0.0074 | 0.0100 | 1000 | 43.5 | 3.1 | -3.9 |
| store_count | 0.0087 | 0.0083 | 0.0090 | 1000 | 43.5 | 1.9 | -5.7 |
| bus_count | -0.0013 | -0.0015 | -0.0012 | 1000 | 43.5 | 7.2 | 4.4 |
| eviction_count | -0.0035 | -0.0035 | -0.0035 | 1000 | 43.5 | 8.8 | -0.1 |
| uninsured_count | -0.0001 | -0.0001 | -0.0001 | 1000 | 43.5 | 2.7 | 9.4 |
| tau2 | 3.0440 | 2.2179 | 4.2320 | 1000 | 100.0 | 126.6 | -1.2 |
| rho | 0.5190 | 0.2848 | 0.7856 | 1000 | 42.3 | 92.2 | -0.9 |
p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
draft %>%
mutate(resid = model.assault$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("1st and Degree Felony Assault: \nPoisson Model Residuals (", p_plot, ")"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.assault <- S.CARleroux(
white_count ~ 1 + gini +
mean_income + mean_rent +
unemployment_count +
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data=draft,
family = "zip",
formula.omega = ~1,
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.assault$summary.results) %>%
rownames_to_column("term") %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | 7.3812 | 7.3216 | 7.4279 | 1000 | 47.0 | 6.2 | 1.8 |
| gini | 2.2399 | 2.1086 | 2.3265 | 1000 | 47.0 | 5.4 | -2.7 |
| mean_income | 0.0000 | 0.0000 | 0.0000 | 1000 | 47.0 | 2.7 | 2.8 |
| mean_rent | -0.0001 | -0.0001 | -0.0001 | 1000 | 47.0 | 3.9 | 0.9 |
| unemployment_count | 0.0004 | 0.0004 | 0.0004 | 1000 | 47.0 | 2.9 | -3.6 |
| transportation_desert_4catLimited | 0.0517 | 0.0460 | 0.0555 | 1000 | 47.0 | 9.4 | -4.1 |
| transportation_desert_4catSatisfactory | 0.1412 | 0.1327 | 0.1477 | 1000 | 47.0 | 3.6 | -2.8 |
| transportation_desert_4catExcellent | 0.3954 | 0.3840 | 0.4018 | 1000 | 47.0 | 2.5 | 1.5 |
| school_count | -0.0146 | -0.0154 | -0.0136 | 1000 | 47.0 | 8.2 | 4.3 |
| store_count | 0.0051 | 0.0049 | 0.0052 | 1000 | 47.0 | 3.6 | 4.8 |
| bus_count | 0.0023 | 0.0020 | 0.0026 | 1000 | 47.0 | 2.2 | 5.7 |
| eviction_count | -0.0026 | -0.0027 | -0.0026 | 1000 | 47.0 | 1.5 | -9.6 |
| uninsured_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 47.0 | 2.4 | -8.7 |
| omega - (Intercept) | -41363.9008 | -41363.9008 | -41363.9008 | 1000 | 0.0 | 0.0 | NaN |
| tau2 | 2.7412 | 2.1448 | 3.5727 | 1000 | 100.0 | 306.7 | -1.9 |
| rho | 0.7027 | 0.4675 | 0.8935 | 1000 | 43.1 | 192.8 | -1.9 |
p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
draft %>%
mutate(resid = model.assault$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("1st and Degree Felony Assault: \nZIP Model Residuals (", p_plot, ")"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Negative Binomial")
rbind(table1, table2, table3, table4) %>%
dplyr::mutate(model = Model, .before=1) %>%
dplyr::select(-Model)%>% kable() %>% kable_styling()
| model | mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|---|
| Poisson | 3835.725 | 43.613943 | 0.0000000 | 0.0083333 |
| Negative Binomial | 4416.318 | 0.415421 | 0.5916667 | 0.9833333 |
| Hierarchichal Poisson | 3641.245 | 3.293774 | 0.0166667 | 0.3666667 |
| Hierarchichal Negative Binomial | 4072.892 | 0.411570 | 0.6250000 | 0.9833333 |
Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 1903.3130070 | 0.4882711 | 1053.9749617 | 3690.9242147 |
| mean_income | 0.0014141 | 0.0000069 | 0.0007642 | 0.0022762 |
| unemployment_count | 0.0439401 | 0.0001686 | 0.0238975 | 0.0644908 |
| transportation_desert_4catLimited | 90.2330125 | 0.2297732 | 36.6221884 | 160.7311834 |
| transportation_desert_4catSatisfactory | 91.4182213 | 0.3000295 | 31.9686484 | 178.8491978 |
| transportation_desert_4catExcellent | 123.4284965 | 0.2783874 | 56.9296869 | 211.6754022 |
| store_count | 0.8923541 | 0.0032509 | 0.4765713 | 1.3486077 |
| bus_count | 0.7071825 | 0.0033973 | 0.1927714 | 1.1140631 |
| eviction_count | -0.3279124 | 0.0004594 | -0.3830488 | -0.2664993 |
After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s white population count:
- Mean income by neighborhood
- Unemployment counts by neighborhood
- Transportation desert status by neighborhood
- Food Vendor counts by neighborhood
- Bus station counts by neighborhood
- Eviction counts by neighborhood.
Next, we interpret each predictor:
- Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
- Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
If we intended to essentialize this list,
transportation_desert_4,eviction_count,store_countseem to be the most informative predictors.
# Load the package
library(e1071)
# Run the algorithm using only data on bill length
naive_model_1 <- naiveBayes(transportation_desert_4cat ~ mean_income + mean_rent + unemployment_count + school_count + store_count + bus_count + eviction_count + uninsured_count + white_count+ black_count+ latinx_count + asian_count, data = nyc_clean)
grid_data <- expand_grid(
black_count = seq(min((nyc_clean %>% drop_na(black_count))$black_count), max((nyc_clean %>% drop_na(black_count))$black_count), length = 100),
white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
# Plot the classification regions
plot_1 <- ggplot(grid_data, aes(x = black_count, y = white_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("Black and White Population") +
labs(x="Black Count", y="White Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
grid_data <- expand_grid(
latinx_count = seq(min((nyc_clean %>% drop_na(latinx_count))$latinx_count), max((nyc_clean %>% drop_na(latinx_count))$latinx_count), length = 100),
white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
# Plot the classification regions
plot_2 <- ggplot(grid_data, aes(x = latinx_count, y = white_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("Latinx and White Population") +
labs(x="Latinx Count", y="White Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
grid_data <- expand_grid(
asian_count = seq(min((nyc_clean %>% drop_na(asian_count))$asian_count), max((nyc_clean %>% drop_na(asian_count))$asian_count), length = 100),
white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
# Plot the classification regions
plot_3 <- ggplot(grid_data, aes(x = asian_count, y = white_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("Asian and White Population") +
labs(x="Asian Count", y="White Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
ggpubr::ggarrange(plot_1, plot_2, plot_3, ncol=3,nrow = 1, common.legend=TRUE, legend = "bottom")
grid_data <- expand_grid(
mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
# Plot the classification regions
white <- ggplot(grid_data, aes(x = mean_income, y = white_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("White Population") +
labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
grid_data <- expand_grid(
mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
black_count = seq(min((nyc_clean %>% drop_na(black_count))$black_count), max((nyc_clean %>% drop_na(black_count))$black_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
black <-ggplot(grid_data, aes(x = mean_income, y = black_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
ggtitle("Black Population")+
labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
grid_data <- expand_grid(
mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
latinx_count = seq(min((nyc_clean %>% drop_na(latinx_count))$latinx_count), max((nyc_clean %>% drop_na(latinx_count))$latinx_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
latinx<-ggplot(grid_data, aes(x = mean_income, y = latinx_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("Latinx Population") +
labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
grid_data <- expand_grid(
mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
asian_count = seq(min((nyc_clean %>% drop_na(asian_count))$asian_count), max((nyc_clean %>% drop_na(asian_count))$asian_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
asian <- ggplot(grid_data, aes(x = mean_income, y = asian_count, color = classification)) +
geom_point(alpha = 1, size=1.5) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
ggtitle("Asian Population")+
labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
ggpubr::ggarrange(white, black, latinx, asian, ncol=2,nrow = 2, common.legend=TRUE, legend = "bottom", label.x = )
grid_data <- expand_grid(
mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
mean_rent = seq(min((nyc_clean %>% drop_na(mean_rent))$mean_rent), max((nyc_clean %>% drop_na(mean_rent))$mean_rent), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
# Plot the classification regions
ggplot(grid_data, aes(x = mean_income, y = mean_rent, color = classification)) +
geom_point(alpha = 1, size=1) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("Mean Rental Price") +
labs(x="Mean Neighborhood Income", y="Mean Neighborhood Rental Price", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
takeaways:
grid_data <- expand_grid(
unemployment_count = seq(min((nyc_clean %>% drop_na(unemployment_count))$unemployment_count), max((nyc_clean %>% drop_na(unemployment_count))$unemployment_count), length = 100),
eviction_count = seq(min((nyc_clean %>% drop_na(eviction_count))$eviction_count), max((nyc_clean %>% drop_na(eviction_count))$eviction_count), length = 100)) %>%
mutate(classification = predict(naive_model_1, newdata = .))
# Plot the classification regions
ggplot(grid_data, aes(x = unemployment_count, y = eviction_count, color = classification)) +
geom_point(alpha = 1, size=1) +
scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") +
theme_minimal() +
ggtitle("Eviction by Unemployment Counts") +
labs(x="Unemployment Count", y="Eviction Count", color="Subway Accessibility")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, hjust=.5, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
takeaways:
We use poisson and negative binomial regression to model observed eviction counts, \(i\), using our \(k=10\) predictors:
- gini
- mean_income
- white_count
- black_count
- latinx_count
- asian_count
- mean_rent
- unemployment_count
- sub_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]
poisson_non_hierarchical <- stan_glm(
eviction_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
uninsured_count,
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) +
xlab("Eviction Count") +
xlim(0,max(nyc_clean$eviction_count))+
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_non_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 55416.1150639 | 1.1562444 | 83.6981133 | 1.189474e+05 |
| mean_rent | -0.1833582 | 0.0003201 | -0.3955940 | -3.026100e-03 |
| unemployment_count | 0.0646197 | 0.0007333 | 0.0137323 | 1.156050e-01 |
| white_count | -0.0122045 | 0.0001759 | -0.0245155 | -3.029000e-04 |
| asian_count | -0.0063640 | 0.0000729 | -0.0114315 | -4.509000e-04 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]
negbin_non_hierarchical <- stan_glm(
eviction_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
uninsured_count,
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) +
xlab("Eviction Count") +
xlim(0,max(nyc_clean$eviction_count))+
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_non_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 30.5453872 | 0.5287620 | 15.0869470 | 59.1729909 |
| gini | 2418.1203641 | 1.2661393 | 475.1531497 | 10412.0972367 |
| unemployment_count | 0.0200076 | 0.0000906 | 0.0088465 | 0.0317101 |
| black_count | 0.0035390 | 0.0000074 | 0.0025635 | 0.0043668 |
| latinx_count | 0.0029908 | 0.0000078 | 0.0018979 | 0.0041004 |
| transportation_desert_4catLimited | 28.8465194 | 0.1195558 | 9.1149525 | 50.1353776 |
| transportation_desert_4catSatisfactory | 17.4168094 | 0.1448793 | 0.4915411 | 41.4977460 |
| transportation_desert_4catExcellent | 32.1688719 | 0.1484052 | 10.7543255 | 59.1151327 |
| school_count | -1.7919412 | 0.0090439 | -2.9495273 | -0.5244692 |
| store_count | 0.2906715 | 0.0021231 | 0.0647539 | 0.5353725 |
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Negative Binomial")
Negative binomial has much better error metrics.
We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with
transportation_desert_4cat:
- mean_income
- mean_rent
- unemployment_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]
poisson_hierarchical <- stan_glmer(
eviction_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
uninsured_count + (1 | borough),
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) +
xlab("Eviction Count") +
xlim(0,max(nyc_clean$eviction_count))+
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 809.6633508 | 4.5449121 | 21.8424000 | 63684.3915131 |
| mean_income | -0.0005827 | 0.0000016 | -0.0010228 | -0.0002919 |
| mean_rent | -0.1117542 | 0.0015536 | -0.2276449 | -0.0028474 |
| unemployment_count | 0.0471095 | 0.0005280 | 0.0117567 | 0.1071384 |
| asian_count | -0.0066243 | 0.0000934 | -0.0130425 | -0.0002948 |
| transportation_desert_4catLimited | 27.8311051 | 0.0345974 | 21.3364953 | 31.8488463 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]
negbin_hierarchical <- stan_glmer(
eviction_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
uninsured_count+ (1 | borough),
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) +
xlab("Eviction Count") +
xlim(0,max(nyc_clean$eviction_count))+
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 53.0835129 | 0.7801604 | 20.5159984 | 139.4805573 |
| gini | 700.8616334 | 1.5425202 | 27.5405436 | 4094.9143873 |
| mean_rent | -0.0353251 | 0.0002534 | -0.0621987 | -0.0022165 |
| white_count | 0.0005786 | 0.0000039 | 0.0000139 | 0.0011426 |
| black_count | 0.0037818 | 0.0000060 | 0.0031138 | 0.0046892 |
| latinx_count | 0.0015668 | 0.0000084 | 0.0006714 | 0.0026564 |
| transportation_desert_4catLimited | 29.4117040 | 0.1157931 | 12.1793852 | 48.5268663 |
| transportation_desert_4catSatisfactory | 22.8799647 | 0.1265743 | 2.4483039 | 42.1049581 |
| transportation_desert_4catExcellent | 41.8088079 | 0.1470004 | 18.2539168 | 70.2206680 |
| school_count | -1.3885206 | 0.0088653 | -2.5049046 | -0.2472096 |
| store_count | 0.4476516 | 0.0019607 | 0.2200616 | 0.7065064 |
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$eviction_count, listw = col_listw, nsim = 999, alternative = "greater")
##
## Monte-Carlo simulation of Moran I
##
## data: col_sp$eviction_count
## weights: col_listw
## number of simulations + 1: 1000
##
## statistic = 0.47185, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.eviction <- S.CARleroux(
eviction_count ~ 1 + gini +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + uninsured_count,
data = nyc_clean,
family = "poisson",
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.eviction$summary.results) %>%
rownames_to_column("term") %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | 0.9732 | 0.4840 | 1.2552 | 1000 | 44.0 | 2.6 | 3.8 |
| gini | 2.8176 | 2.2955 | 3.7680 | 1000 | 44.0 | 2.7 | -3.2 |
| unemployment_count | 0.0007 | 0.0007 | 0.0008 | 1000 | 44.0 | 2.3 | 7.1 |
| white_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.0 | 4.7 | -1.3 |
| black_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.0 | 4.0 | 2.0 |
| latinx_count | 0.0000 | 0.0000 | 0.0001 | 1000 | 44.0 | 7.8 | 1.7 |
| asian_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.0 | 3.4 | -4.7 |
| transportation_desert_4catLimited | 0.0251 | -0.0213 | 0.0847 | 1000 | 44.0 | 3.9 | -2.4 |
| transportation_desert_4catSatisfactory | 0.3465 | 0.3192 | 0.3670 | 1000 | 44.0 | 8.3 | 1.4 |
| transportation_desert_4catExcellent | 0.1966 | 0.1065 | 0.2292 | 1000 | 44.0 | 3.4 | 1.1 |
| school_count | 0.0074 | 0.0009 | 0.0174 | 1000 | 44.0 | 2.6 | -5.4 |
| store_count | 0.0087 | 0.0078 | 0.0093 | 1000 | 44.0 | 5.6 | 0.1 |
| uninsured_count | -0.0001 | -0.0001 | -0.0001 | 1000 | 44.0 | 1.7 | -1.9 |
| tau2 | 2.7575 | 1.9175 | 3.8906 | 1000 | 100.0 | 64.7 | -1.2 |
| rho | 0.3607 | 0.1643 | 0.5784 | 1000 | 49.5 | 57.1 | -0.7 |
p_plot <- round((moran.mc(x = as.vector(model.eviction$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
nyc_clean %>%
mutate(resid = model.eviction$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("Eviction: \nPoisson Model Residuals (", p_plot, ")"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
draft <- nyc_clean
col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.eviction <- S.CARleroux(
eviction_count ~ 1 + gini +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + uninsured_count,
data = nyc_clean,
family = "zip",
formula.omega = ~1,
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.eviction$summary.results) %>%
rownames_to_column("term") %>%
filter(Median != 0 ) %>%
filter(`2.5%` > 0 & `97.5%` > 0 | `2.5%` < 0 & `97.5%` < 0) %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | 3.0805 | 2.8014 | 3.4537 | 1000 | 46.0 | 2.9 | 5.0 |
| unemployment_count | 0.0004 | 0.0004 | 0.0005 | 1000 | 46.0 | 1.4 | -1.8 |
| asian_count | 0.0001 | 0.0001 | 0.0001 | 1000 | 46.0 | 9.8 | 2.3 |
| transportation_desert_4catLimited | -0.2499 | -0.3303 | -0.1594 | 1000 | 46.0 | 1.2 | -5.9 |
| transportation_desert_4catSatisfactory | -0.4019 | -0.4304 | -0.3651 | 1000 | 46.0 | 7.6 | 4.9 |
| transportation_desert_4catExcellent | -0.2977 | -0.3433 | -0.2393 | 1000 | 46.0 | 3.2 | -0.5 |
| school_count | -0.0212 | -0.0244 | -0.0149 | 1000 | 46.0 | 8.9 | -2.2 |
| store_count | 0.0108 | 0.0098 | 0.0122 | 1000 | 46.0 | 2.9 | -4.2 |
| omega - (Intercept) | 46600.1794 | 46600.1794 | 46600.1794 | 1000 | 0.0 | 0.0 | NaN |
| tau2 | 4.1346 | 3.0057 | 5.6135 | 1000 | 100.0 | 112.3 | -1.8 |
| rho | 0.6653 | 0.3936 | 0.9241 | 1000 | 49.9 | 86.6 | -0.4 |
# p_plot <- round((moran.mc(x = as.vector(model.eviction$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
nyc_clean %>%
mutate(resid = model.eviction$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("Eviction: \nZIP Model Residuals ( p_plot)"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Negative Binomial")
rbind(table1, table2, table3, table4) %>%
dplyr::mutate(model = Model, .before=1) %>%
dplyr::select(-Model)%>% kable() %>% kable_styling()
| model | mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|---|
| Poisson | 124.1150 | 1.1197121 | 0.5083333 | 0.6416667 |
| Negative Binomial | 58.9475 | 0.4954948 | 0.6166667 | 0.9666667 |
| Hierarchichal Poisson | 107.6275 | 0.9523888 | 0.5333333 | 0.6666667 |
| Hierarchichal Negative Binomial | 55.4150 | 0.5521519 | 0.5833333 | 0.9833333 |
Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 30.5453872 | 0.5287620 | 15.0869470 | 59.1729909 |
| gini | 2418.1203641 | 1.2661393 | 475.1531497 | 10412.0972367 |
| unemployment_count | 0.0200076 | 0.0000906 | 0.0088465 | 0.0317101 |
| black_count | 0.0035390 | 0.0000074 | 0.0025635 | 0.0043668 |
| latinx_count | 0.0029908 | 0.0000078 | 0.0018979 | 0.0041004 |
| transportation_desert_4catLimited | 28.8465194 | 0.1195558 | 9.1149525 | 50.1353776 |
| transportation_desert_4catSatisfactory | 17.4168094 | 0.1448793 | 0.4915411 | 41.4977460 |
| transportation_desert_4catExcellent | 32.1688719 | 0.1484052 | 10.7543255 | 59.1151327 |
| school_count | -1.7919412 | 0.0090439 | -2.9495273 | -0.5244692 |
| store_count | 0.2906715 | 0.0021231 | 0.0647539 | 0.5353725 |
After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s felony eviction counts:
- Mean income by neighborhood
- Unemployment counts by neighborhood
- Transportation desert status by neighborhood
- Food Vendor counts by neighborhood
- Bus station counts by neighborhood
- Eviction counts by neighborhood.
Next, we interpret each predictor:
- Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
- Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
If we intended to essentialize this list,
transportation_desert_4,eviction_count,store_countseem to be the most informative predictors.
We use poisson and negative binomial regression to model observed assault counts, \(i\), using our \(k=10\) predictors:
- gini
- mean_income
- white_count
- black_count
- latinx_count
- asian_count
- mean_rent
- unemployment_count
- sub_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]
poisson_non_hierarchical <- stan_glm(
assault_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) +
xlab("Felon Assault Count") +
xlim(0,max(nyc_clean$assault_count))+
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_non_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 3.5864689 | 0.1983254 | 2.7327398 | 4.6456268 |
| gini | 5396.0460106 | 0.4182073 | 3285.4646368 | 9804.7747080 |
| mean_income | 0.0002932 | 0.0000012 | 0.0001445 | 0.0004605 |
| mean_rent | -0.0448781 | 0.0000858 | -0.0565014 | -0.0344047 |
| white_count | -0.0014579 | 0.0000014 | -0.0016540 | -0.0012487 |
| latinx_count | 0.0007796 | 0.0000025 | 0.0003967 | 0.0010653 |
| transportation_desert_4catLimited | 60.2880345 | 0.0531195 | 50.1488375 | 69.0748289 |
| transportation_desert_4catSatisfactory | 29.7228730 | 0.0601048 | 20.0323022 | 39.9971227 |
| transportation_desert_4catExcellent | 73.5251029 | 0.0594743 | 60.9106951 | 85.1933438 |
| store_count | 0.5518890 | 0.0005727 | 0.4844971 | 0.6122244 |
| bus_count | 0.5740996 | 0.0004254 | 0.5258629 | 0.6227106 |
| eviction_count | 0.0421430 | 0.0000887 | 0.0312818 | 0.0558097 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]
negbin_non_hierarchical <- stan_glm(
assault_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) +
xlab("Felony Assualt Count") +
xlim(0,max(nyc_clean$assault_count))+
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_non_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 2.1998363 | 0.8420645 | 0.6470274 | 7.362050e+00 |
| gini | 9155.7878631 | 1.9906985 | 541.5958625 | 1.066761e+05 |
| mean_income | 0.0006529 | 0.0000046 | 0.0001557 | 1.293700e-03 |
| mean_rent | -0.0700853 | 0.0003057 | -0.1100915 | -3.038670e-02 |
| white_count | -0.0016060 | 0.0000072 | -0.0026489 | -7.126000e-04 |
| transportation_desert_4catLimited | 70.4209661 | 0.1998572 | 33.6839614 | 1.305728e+02 |
| transportation_desert_4catExcellent | 87.7150816 | 0.2308285 | 29.8265673 | 1.650792e+02 |
| school_count | 1.7349760 | 0.0131659 | 0.1842995 | 3.408764e+00 |
| store_count | 0.7203276 | 0.0033028 | 0.3282823 | 1.123578e+00 |
| bus_count | 0.7038286 | 0.0023984 | 0.3565005 | 9.673685e-01 |
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Negative Binomial")
Negative binomial has much better error metrics.
We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with
transportation_desert_4cat:
- mean_income
- mean_rent
- unemployment_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]
poisson_hierarchical <- stan_glmer(
assault_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count + (1 | borough),
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) +
xlab("Felony Assault Count") +
xlim(0,max(nyc_clean$assault_count))+
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 2.604147e+05 | 13.9154929 | 6.2855040 | 2.641469e+09 |
| unemployment_count | -1.019620e-02 | 0.0000182 | -0.0112094 | -4.090700e-03 |
| white_count | -1.161110e-02 | 0.0001479 | -0.0215374 | -1.258900e-03 |
| school_count | 5.656243e+00 | 0.0685694 | 0.4419139 | 1.066438e+01 |
| bus_count | 3.083773e+00 | 0.0363440 | 0.5476908 | 5.656987e+00 |
| eviction_count | 2.645923e-01 | 0.0029338 | 0.0488013 | 4.619764e-01 |
| uninsured_count | 5.206250e-02 | 0.0007241 | 0.0008345 | 1.009630e-01 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]
negbin_hierarchical <- stan_glmer(
assault_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count+ (1 | borough),
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) +
xlab("Felony Assault Count") +
xlim(0,max(nyc_clean$assault_count))+
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 5.3270832 | 1.0967411 | 1.6974017 | 23.5092739 |
| mean_rent | -0.0739410 | 0.0003625 | -0.1154820 | -0.0266327 |
| white_count | -0.0014585 | 0.0000071 | -0.0024297 | -0.0006557 |
| transportation_desert_4catLimited | 80.6050068 | 0.1977024 | 38.4474675 | 132.9241017 |
| transportation_desert_4catExcellent | 101.3320462 | 0.2694313 | 41.4451363 | 179.5988054 |
| school_count | 2.3608552 | 0.0140133 | 0.6749783 | 4.1613701 |
| store_count | 0.8449207 | 0.0030550 | 0.4437088 | 1.2270165 |
| bus_count | 0.6870714 | 0.0028121 | 0.3789878 | 1.0114716 |
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$assault_count, listw = col_listw, nsim = 999, alternative = "greater")
##
## Monte-Carlo simulation of Moran I
##
## data: col_sp$assault_count
## weights: col_listw
## number of simulations + 1: 1000
##
## statistic = 0.19199, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.assault <- S.CARleroux(
assault_count ~ 1 + gini +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + eviction_count + uninsured_count,
data = nyc_clean,
family = "poisson",
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.assault$summary.results) %>%
rownames_to_column("term") %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | -1.9999 | -2.6628 | -1.3056 | 1000 | 44.6 | 4.8 | 0.8 |
| gini | 7.2529 | 5.6065 | 8.4223 | 1000 | 44.6 | 4.6 | -0.4 |
| unemployment_count | 0.0000 | -0.0001 | 0.0001 | 1000 | 44.6 | 6.2 | -4.8 |
| white_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.6 | 2.5 | -0.7 |
| black_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.6 | 3.8 | 0.8 |
| latinx_count | 0.0000 | 0.0000 | 0.0001 | 1000 | 44.6 | 1.6 | 6.9 |
| asian_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.6 | 10.5 | -2.0 |
| transportation_desert_4catLimited | 0.4898 | 0.3948 | 0.6482 | 1000 | 44.6 | 1.8 | 6.0 |
| transportation_desert_4catSatisfactory | 0.1118 | -0.0032 | 0.2352 | 1000 | 44.6 | 3.2 | -3.7 |
| transportation_desert_4catExcellent | 0.3385 | 0.2085 | 0.4497 | 1000 | 44.6 | 2.1 | -9.4 |
| school_count | 0.0090 | -0.0045 | 0.0174 | 1000 | 44.6 | 3.9 | -3.2 |
| store_count | 0.0047 | 0.0030 | 0.0070 | 1000 | 44.6 | 2.7 | -1.9 |
| eviction_count | 0.0004 | 0.0000 | 0.0005 | 1000 | 44.6 | 4.1 | 2.3 |
| uninsured_count | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.6 | 10.1 | -2.1 |
| tau2 | 0.8795 | 0.6206 | 1.3123 | 1000 | 100.0 | 57.0 | 1.5 |
| rho | 0.0651 | 0.0040 | 0.1934 | 1000 | 44.9 | 57.7 | 1.2 |
p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
nyc_clean %>%
mutate(resid = model.assault$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("1st and Degree Felony Assault: \nPoisson Model Residuals (", p_plot, ")"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
draft <- nyc_clean
col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.assault <- S.CARleroux(
assault_count ~ 1 + gini +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + eviction_count + uninsured_count,
data = nyc_clean,
family = "zip",
formula.omega = ~1,
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.assault$summary.results) %>%
rownames_to_column("term") %>%
filter(Median != 0 ) %>%
filter(`2.5%` > 0 & `97.5%` > 0 | `2.5%` < 0 & `97.5%` < 0) %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | -1.9043 | -3.1216 | -1.1967 | 1000 | 47.0 | 2.9 | 0.7 |
| gini | 7.0408 | 5.0297 | 9.3690 | 1000 | 47.0 | 2.8 | -0.4 |
| latinx_count | 0.0001 | 0.0001 | 0.0001 | 1000 | 47.0 | 10.8 | 2.6 |
| transportation_desert_4catLimited | 0.4587 | 0.3100 | 0.5598 | 1000 | 47.0 | 2.1 | -9.1 |
| transportation_desert_4catSatisfactory | 0.1377 | 0.0744 | 0.1956 | 1000 | 47.0 | 7.5 | 2.3 |
| transportation_desert_4catExcellent | 0.2915 | 0.1986 | 0.3877 | 1000 | 47.0 | 3.3 | 4.8 |
| school_count | -0.0153 | -0.0267 | -0.0041 | 1000 | 47.0 | 3.2 | -5.0 |
| store_count | 0.0131 | 0.0110 | 0.0150 | 1000 | 47.0 | 4.6 | 2.7 |
| eviction_count | 0.0006 | 0.0003 | 0.0008 | 1000 | 47.0 | 3.9 | 1.0 |
| uninsured_count | -0.0002 | -0.0002 | -0.0002 | 1000 | 47.0 | 3.1 | -10.5 |
| omega - (Intercept) | -11809.4038 | -11809.4038 | -11809.4038 | 1000 | 0.0 | 0.0 | NaN |
| tau2 | 0.8459 | 0.6464 | 1.1860 | 1000 | 100.0 | 119.5 | 0.2 |
| rho | 0.0452 | 0.0029 | 0.1449 | 1000 | 47.9 | 95.9 | 0.2 |
# p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
# nyc_clean %>%
# mutate(resid = model.assault$residuals$response) %>%
# ggplot(aes(fill = resid), color = "#8f98aa") +
# geom_sf()+
# scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
# guide_legend(title = "Residual")) +
# theme_minimal() +
# theme(panel.grid.major = element_line("transparent"),
# axis.text = element_blank()) +
# ggtitle(paste0("1st and Degree Felony Assault: \nZIP Model Residuals (", p_plot, ")"))+
# theme(panel.grid.major = element_line("transparent"),
# plot.title = element_text(size = 25, face = "bold"),
# legend.title = element_text(size = 12),
# legend.text = element_text(size = 12)) +
# guides(shape = guide_legend(override.aes = list(size = 8)),
# color = guide_legend(override.aes = list(size = 8)))
table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Negative Binomial")
rbind(table1, table2, table3, table4) %>%
dplyr::mutate(model = Model, .before=1) %>%
dplyr::select(-Model)%>% kable() %>% kable_styling()
| model | mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|---|
| Poisson | 19.7075 | 2.7633695 | 0.1416667 | 0.3333333 |
| Negative Binomial | 20.6150 | 0.6005202 | 0.5583333 | 0.9833333 |
| Hierarchichal Poisson | 29.5525 | 0.9424668 | 0.5500000 | 0.6500000 |
| Hierarchichal Negative Binomial | 20.4425 | 0.6541777 | 0.5000000 | 0.9666667 |
Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 2.1998363 | 0.8420645 | 0.6470274 | 7.362050e+00 |
| gini | 9155.7878631 | 1.9906985 | 541.5958625 | 1.066761e+05 |
| mean_income | 0.0006529 | 0.0000046 | 0.0001557 | 1.293700e-03 |
| mean_rent | -0.0700853 | 0.0003057 | -0.1100915 | -3.038670e-02 |
| white_count | -0.0016060 | 0.0000072 | -0.0026489 | -7.126000e-04 |
| transportation_desert_4catLimited | 70.4209661 | 0.1998572 | 33.6839614 | 1.305728e+02 |
| transportation_desert_4catExcellent | 87.7150816 | 0.2308285 | 29.8265673 | 1.650792e+02 |
| school_count | 1.7349760 | 0.0131659 | 0.1842995 | 3.408764e+00 |
| store_count | 0.7203276 | 0.0033028 | 0.3282823 | 1.123578e+00 |
| bus_count | 0.7038286 | 0.0023984 | 0.3565005 | 9.673685e-01 |
After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s felony assault counts:
- Mean income by neighborhood
- Unemployment counts by neighborhood
- Transportation desert status by neighborhood
- Food Vendor counts by neighborhood
- Bus station counts by neighborhood
- Eviction counts by neighborhood.
Next, we interpret each predictor:
- Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
- Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
If we intended to essentialize this list,
transportation_desert_4,eviction_count,store_countseem to be the most informative predictors.
We use poisson and negative binomial regression to model observed assault counts, \(i\), using our \(k=10\) predictors:
- gini
- mean_income
- white_count
- black_count
- latinx_count
- asian_count
- mean_rent
- unemployment_count
- sub_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]
poisson_non_hierarchical <- stan_glm(
sex_crime_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) +
xlab("Sex-Based Crimes Count") +
xlim(0,max(nyc_clean$sex_crime_count))+
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_non_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 0.4120347 | 0.4006631 | 0.2273258 | 0.6758560 |
| gini | 3587.9061283 | 0.8780549 | 1198.0715235 | 11249.5836798 |
| mean_income | -0.0006794 | 0.0000028 | -0.0009775 | -0.0001463 |
| unemployment_count | -0.0354126 | 0.0000596 | -0.0422560 | -0.0281694 |
| white_count | 0.0013926 | 0.0000022 | 0.0010219 | 0.0017033 |
| black_count | 0.0019501 | 0.0000041 | 0.0013431 | 0.0024266 |
| latinx_count | 0.0039675 | 0.0000048 | 0.0031338 | 0.0046113 |
| asian_count | 0.0054798 | 0.0000048 | 0.0045999 | 0.0060160 |
| transportation_desert_4catLimited | 173.1518336 | 0.1147418 | 140.5511570 | 219.2276179 |
| transportation_desert_4catSatisfactory | 84.1372776 | 0.1358524 | 54.0562285 | 117.7787728 |
| transportation_desert_4catExcellent | 201.2982840 | 0.1286909 | 161.8717676 | 258.7494905 |
| school_count | 0.6618839 | 0.0046520 | 0.0790657 | 1.4987590 |
| store_count | 0.1256186 | 0.0010681 | 0.0109349 | 0.2731614 |
| bus_count | 0.8590682 | 0.0007970 | 0.7546579 | 0.9676299 |
| eviction_count | 0.0772791 | 0.0001721 | 0.0571873 | 0.1019295 |
| uninsured_count | -0.0117684 | 0.0000217 | -0.0142757 | -0.0081621 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]
negbin_non_hierarchical <- stan_glm(
sex_crime_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count,
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) +
xlab("Sex-Based Crime Count") +
xlim(0,max(nyc_clean$sex_crime_count))+
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_non_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 0.1412859 | 1.6773478 | 0.0205149 | 1.487618e+00 |
| gini | 83112.5360342 | 3.3742642 | 699.4828738 | 5.658686e+06 |
| mean_rent | -0.0690438 | 0.0005387 | -0.1351984 | -4.143100e-03 |
| latinx_count | 0.0047039 | 0.0000287 | 0.0007911 | 7.644200e-03 |
| asian_count | 0.0068104 | 0.0000284 | 0.0029518 | 9.905500e-03 |
| transportation_desert_4catLimited | 227.3774554 | 0.4375992 | 88.3039435 | 4.952490e+02 |
| transportation_desert_4catExcellent | 238.0095508 | 0.4822838 | 82.4135280 | 5.960894e+02 |
| store_count | 0.8100002 | 0.0069962 | 0.0412584 | 1.773830e+00 |
| uninsured_count | -0.0171355 | 0.0001329 | -0.0318871 | -1.290800e-03 |
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Negative Binomial")
Negative binomial has much better error metrics.
We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with
transportation_desert_4cat:
- mean_income
- mean_rent
- unemployment_count
- transportation_desert_4cat
- school_count
- store_count
- bus_count
- eviction_count
- uninsured_count
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]
poisson_hierarchical <- stan_glmer(
sex_crime_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count + (1 | borough),
data = nyc_clean,
family = poisson,
chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) +
xlab("Sex-Based Crime Count") +
xlim(0,max(nyc_clean$sex_crime_count))+
labs(title = "Poisson")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_poisson <- posterior_predict(
poisson_hierarchical, newdata = nyc_predict_clean)
library(tidybayes)
library(bayesplot)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 9088.3043289 | 18.5470740 | 0.0134379 | 2.953498e+09 |
| unemployment_count | -0.0126650 | 0.0000227 | -0.0331440 | -1.122810e-02 |
| asian_count | 0.0067423 | 0.0000331 | 0.0038719 | 1.001290e-02 |
| bus_count | 2.9602367 | 0.0349820 | 0.4975013 | 5.678168e+00 |
\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]
negbin_hierarchical <- stan_glmer(
sex_crime_count ~ gini + mean_income + mean_rent +
unemployment_count + white_count + black_count + latinx_count + asian_count+
transportation_desert_4cat + school_count +
store_count + bus_count +
eviction_count + uninsured_count+ (1 | borough),
data = nyc_clean,
family = neg_binomial_2,
chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) +
xlab("Sex-Based Crime Count") +
xlim(0,max(nyc_clean$sex_crime_count))+
labs(title = "Negative Binomial")+
theme(plot.title = element_text(face="bold", size=25, hjust=.5))
nyc_predict_clean <- nyc_clean %>%
na.omit()
set.seed(84735)
predictions_negbin <- posterior_predict(
negbin_hierarchical, newdata = nyc_predict_clean)
ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = nyc_predict_clean$nta_id,
breaks = 1:nrow(nyc_predict_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 5.871170e-02 | 2.2911940 | 0.0035598 | 1.023883e+00 |
| gini | 4.268212e+05 | 4.3865093 | 1551.1395970 | 4.938351e+07 |
| asian_count | 5.624600e-03 | 0.0000349 | 0.0017718 | 9.856000e-03 |
| transportation_desert_4catLimited | 2.305360e+02 | 0.4389344 | 97.0243687 | 4.840791e+02 |
| transportation_desert_4catSatisfactory | 7.448618e+01 | 0.4333771 | 9.7214513 | 2.025286e+02 |
| transportation_desert_4catExcellent | 2.902775e+02 | 0.4976055 | 120.8134610 | 5.857729e+02 |
| store_count | 1.091114e+00 | 0.0081662 | 0.0735059 | 2.042746e+00 |
| bus_count | 6.415640e-01 | 0.0052005 | 0.0561314 | 1.435678e+00 |
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$sex_crime_count, listw = col_listw, nsim = 999, alternative = "greater")
##
## Monte-Carlo simulation of Moran I
##
## data: col_sp$sex_crime_count
## weights: col_listw
## number of simulations + 1: 1000
##
## statistic = -0.04102, observed rank = 172, p-value = 0.828
## alternative hypothesis: greater
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.sex <- S.CARleroux(
sex_crime_count ~ 1 + total_pop + gini + below_poverty_line_count,
data = nyc_clean,
family = "poisson",
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.sex$summary.results) %>%
rownames_to_column("term") %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | -2.5346 | -3.5162 | -1.4237 | 1000 | 44.9 | 8.3 | -2.0 |
| total_pop | 0.0000 | 0.0000 | 0.0000 | 1000 | 44.9 | 1.3 | 1.2 |
| gini | 5.1898 | 1.8197 | 7.2130 | 1000 | 44.9 | 6.7 | 0.6 |
| below_poverty_line_count | 0.0000 | 0.0000 | 0.0001 | 1000 | 44.9 | 3.4 | 3.9 |
| tau2 | 2.1539 | 1.7194 | 2.7787 | 1000 | 100.0 | 215.7 | 0.2 |
| rho | 0.0116 | 0.0006 | 0.0481 | 1000 | 46.5 | 101.4 | 0.6 |
p_plot <- round((moran.mc(x = as.vector(model.sex$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
nyc_clean %>%
mutate(resid = model.sex$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("Sex-Based Violence: \nPoisson Model Residuals (", p_plot, ")"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
M.burnin <- 10000 # Number of burn-in iterations (discarded)
M <- 1000 # Number of iterations retained
model.sex <- S.CARleroux(
sex_crime_count ~ 1 + total_pop + gini + below_poverty_line_count,
data = nyc_clean,
family = "zip",
formula.omega = ~1,
W = W,
burnin = M.burnin,
n.sample = M.burnin + M, # Total iterations
verbose = FALSE)
as.data.frame(model.sex$summary.results) %>%
rownames_to_column("term") %>%
kable()%>%
kable_styling() %>%
scroll_box(width = "100%", height = "200px")
| term | Median | 2.5% | 97.5% | n.sample | % accept | n.effective | Geweke.diag |
|---|---|---|---|---|---|---|---|
| (Intercept) | -2.0063 | -4.1039 | -0.7230 | 1000 | 40.1 | 5.3 | -9.5 |
| total_pop | 0.0000 | 0.0000 | 0.0000 | 1000 | 40.1 | 3.1 | -3.0 |
| gini | 4.8409 | 2.4735 | 9.9535 | 1000 | 40.1 | 5.1 | 11.7 |
| below_poverty_line_count | 0.0001 | 0.0000 | 0.0001 | 1000 | 40.1 | 2.5 | 0.4 |
| omega - (Intercept) | -12914.7514 | -12914.7514 | -12914.7514 | 1000 | 0.0 | 0.0 | NaN |
| tau2 | 2.2393 | 1.7625 | 2.9347 | 1000 | 100.0 | 155.8 | -0.9 |
| rho | 0.0076 | 0.0001 | 0.0473 | 1000 | 40.4 | 92.9 | -1.1 |
p_plot <- round((moran.mc(x = as.vector(model.sex$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
nyc_clean %>%
mutate(resid = model.sex$residuals$response) %>%
ggplot(aes(fill = resid), color = "#8f98aa") +
geom_sf()+
scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
guide_legend(title = "Residual")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle(paste0("Sex-Based Violence: \nZIP Model Residuals (", p_plot, ")"))+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 25, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>%
mutate(Model = "Hierarchichal Negative Binomial")
rbind(table1, table2, table3, table4) %>%
dplyr::mutate(model = Model, .before=1) %>%
dplyr::select(-Model)%>% kable() %>% kable_styling()
| model | mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|---|
| Poisson | 9.8175 | 2.4370585 | 0.1000000 | 0.4666667 |
| Negative Binomial | 13.1000 | 0.5442446 | 0.5666667 | 0.9833333 |
| Hierarchichal Poisson | 9.7000 | 0.9074845 | 0.4833333 | 0.6833333 |
| Hierarchichal Negative Binomial | 13.1975 | 0.5221313 | 0.5833333 | 1.0000000 |
Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.
tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 0.1412859 | 1.6773478 | 0.0205149 | 1.487618e+00 |
| gini | 83112.5360342 | 3.3742642 | 699.4828738 | 5.658686e+06 |
| mean_rent | -0.0690438 | 0.0005387 | -0.1351984 | -4.143100e-03 |
| latinx_count | 0.0047039 | 0.0000287 | 0.0007911 | 7.644200e-03 |
| asian_count | 0.0068104 | 0.0000284 | 0.0029518 | 9.905500e-03 |
| transportation_desert_4catLimited | 227.3774554 | 0.4375992 | 88.3039435 | 4.952490e+02 |
| transportation_desert_4catExcellent | 238.0095508 | 0.4822838 | 82.4135280 | 5.960894e+02 |
| store_count | 0.8100002 | 0.0069962 | 0.0412584 | 1.773830e+00 |
| uninsured_count | -0.0171355 | 0.0001329 | -0.0318871 | -1.290800e-03 |
After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s sex-based crime counts:
- Mean income by neighborhood
- Unemployment counts by neighborhood
- Transportation desert status by neighborhood
- Food Vendor counts by neighborhood
- Bus station counts by neighborhood
- Eviction counts by neighborhood.
Next, we interpret each predictor:
- Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
- Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
- Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
- Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
If we intended to essentialize this list,
transportation_desert_4,eviction_count,store_countseem to be the most informative predictors.
We intend to adjust our model specifications and reconsider the construction and inclusion of our predictors. For example, we may want to rethink how we’re making our transit-access variable, whether including car ownership or transit use, and what predictors we’re specifically using in our models.
Moving forward, we want to build some non-hierarchical models predicting subway and bus count using the demographic data (like median income, rent, percentage white population, etc) to see the flip-side of what we are looking at right now with our models above.
Our current hierarchical models attempt to capture a spatial interaction by borough, but we may extend this hierarchy to do census-tracts within neighborhoods, so as to unnderstand observed differences at a finer level. Further, we may include latitude and longitude variables into our non-grouped model to account for spatial relationships and avoid hierarchy altogether. However, we may use
CARBayesto adjust for the spatial factors.
- Sam Ding: Pulled and prepared subway ridership and access data, then harmonized transit data with existing neighborhood data.
- Vichy Meas: Helped clean both transit and demographic data, scaffolded the checkpoint 3 responses, and helped plan out transit questions.
- Freddy Barragan: Pulled and prepared grocery, bus stop, school, eviction, and census data, made exploratory visualizations by neighborhood, built/coded our proposed demographic models.
- Juthi Dewan: Harmonized, cleaned, and joined all disparate datasets, prepared our data summaries, built/coded our proposed demographic models.